library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble 2.1.1 v purrr 0.3.2
## v tidyr 0.8.3 v dplyr 0.8.0.1
## v readr 1.3.1 v stringr 1.4.0
## v tibble 2.1.1 v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(corrplot)
## corrplot 0.84 loaded
library(readr)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Load the EPA vehicle data.
# readr guesses each column's type from the first 1000 rows by default. In this
# file that makes sparse columns such as `guzzler` look logical, so later
# values like "G" fail to parse and are silently replaced by NA. Scanning more
# rows than the file contains lets the guess see those values.
# (Any console output recorded below this call predates this change; re-knit
# to refresh it.)
epa.cars <- read_csv("big_epa_cars.csv", guess_max = 100000)
## Parsed with column specification:
## cols(
## .default = col_double(),
## drive = col_character(),
## eng_dscr = col_character(),
## fuelType = col_character(),
## fuelType1 = col_character(),
## make = col_character(),
## model = col_character(),
## mpgData = col_character(),
## phevBlended = col_logical(),
## trany = col_character(),
## VClass = col_character(),
## guzzler = col_logical(),
## trans_dscr = col_character(),
## tCharger = col_logical(),
## sCharger = col_character(),
## atvType = col_character(),
## fuelType2 = col_logical(),
## rangeA = col_logical(),
## evMotor = col_logical(),
## mfrCode = col_logical(),
## c240Dscr = col_logical()
## # ... with 4 more columns
## )
## See spec(...) for full column specifications.
## Warning: 26930 parsing failures.
## row col expected actual file
## 4430 guzzler 1/0/T/F/TRUE/FALSE G 'big_epa_cars.csv'
## 4431 guzzler 1/0/T/F/TRUE/FALSE G 'big_epa_cars.csv'
## 4432 guzzler 1/0/T/F/TRUE/FALSE G 'big_epa_cars.csv'
## 4433 guzzler 1/0/T/F/TRUE/FALSE G 'big_epa_cars.csv'
## 4442 guzzler 1/0/T/F/TRUE/FALSE G 'big_epa_cars.csv'
## .... ....... .................. ...... ..................
## See problems(...) for more details.
# Keep only the columns used in the analysis and give the EPA fields shorter
# names. select() both renames (new = old) and picks columns, in the order
# listed here.
# NOTE(review): the EPA field documentation appears to use -1 in `co2` for
# "not available" -- confirm, since sentinel values would distort the
# correlations computed from this column.
epa.cars <- select(
  epa.cars,
  barrels = barrels08,
  fuel_type = fuelType1,
  mpg = comb08,
  c02 = co2,
  fuel_cost = fuelCost08,
  drive,
  make,
  model,
  year,
  VClass
)
# Plot the pairwise correlations of the numeric columns. Rows with a missing
# value in any of these columns are excluded ("complete.obs").
corrplot(
  cor(
    epa.cars[c("year", "barrels", "mpg", "c02", "fuel_cost")],
    use = "complete.obs"
  )
)
One thing we notice in this correlation matrix is that cars tend to get better mpg and use less total fuel (fewer barrels) as the model year increases. However, more recent cars appear to produce more CO2 emissions (the `c02` column).
# Count the number of vehicles per make: length() of the mpg values that fall
# within each make.
makefrequency <- aggregate(
  epa.cars[["mpg"]],
  by = list(epa.cars[["make"]]),
  FUN = length
)
names(makefrequency) <- c("Make", "Frequency")
# Rank makes by count, largest first, and keep the first 15 of that ranking.
makefrequency <- makefrequency[
  order(makefrequency$Frequency, decreasing = TRUE)[seq_len(15)],
]
pie(makefrequency$Frequency, labels = makefrequency$Make)
Here we can see each make's share of the vehicles among the 15 most frequent makes. Chevrolet, Ford, and Dodge are the 3 most common makes in this dataset.
# Restrict the data to the five makes compared in the trend plot.
top5makes <- epa.cars[
  epa.cars$make %in% c("Chevrolet", "Ford", "Dodge", "GMC", "Toyota"),
]
# Median mpg for every (make, year) combination. Naming the elements of `x`
# and `by` makes aggregate() label the result columns directly.
medianmpg <- aggregate(
  x = list(median_mpg = top5makes$mpg),
  by = list(make = top5makes$make, year = top5makes$year),
  FUN = median
)
# One line per make showing median mpg by model year; x-axis labels are
# rotated to vertical. geom_line() already uses the identity stat.
line <- ggplot(
  data = medianmpg,
  mapping = aes(x = year, y = median_mpg, colour = make, group = make)
) +
  geom_line() +
  theme(axis.text.x = element_text(angle = 90))
# Convert the static ggplot into an interactive plotly widget.
ggplotly(line)
With this line chart we see that for the top 5 car makes, the fuel economy tends to get better over time. Also, of these 5 makes, Toyota has by far the highest median mpg.
# Bar chart of the number of vehicles for each fuel type.
ggplot(data = epa.cars) +
  geom_bar(mapping = aes(x = fuel_type))
Here we see that most of the cars take either premium or regular gasoline, with regular being by far the most common. A small percentage of cars use diesel, and very few use electricity, midgrade gasoline, or natural gas.
# Keep only vehicles that run on regular or premium gasoline.
gasoline <- epa.cars[
  epa.cars$fuel_type %in% c("Regular Gasoline", "Premium Gasoline"),
]
# Side-by-side boxplots of mpg for the two gasoline grades.
ggplot(data = gasoline, mapping = aes(x = fuel_type, y = mpg)) +
  geom_boxplot()
This shows us whether or not premium gasoline is better than regular. Regular gasoline appears to have a wider range for mpg and has a slightly higher median mpg.
# Mean mpg for each gasoline grade, with display-friendly column names.
avgs <- setNames(
  aggregate(gasoline$mpg, by = list(gasoline$fuel_type), FUN = mean),
  c("Fuel_Type", "Mean MPG")
)
avgs
## Fuel_Type Mean MPG
## 1 Premium Gasoline 19.77485
## 2 Regular Gasoline 20.35122
This shows us that the mean mpg for regular gasoline is slightly higher than for premium gasoline.
# One-way ANOVA of mpg across the two gasoline grades; summary() prints the
# F test for a difference in group means.
fuel_type_anova <- aov(gasoline$mpg ~ gasoline$fuel_type)
summary(fuel_type_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## gasoline$fuel_type 1 2807 2806.8 104.7 <2e-16 ***
## Residuals 40223 1078791 26.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Here we can confirm that the means for premium and regular are actually different. The null hypothesis is that both means are equal, and the p-value is very low, so we can conclude that the mean mpg for regular is slightly higher than for premium.